import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


df = pd.read_csv('../DATA/heart.csv')


df.head()


df['target'].unique()

array([1, 0], dtype=int64)


# CODE HERE
df.isnull().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       303 non-null    bool 
 1   sex       303 non-null    bool 
 2   cp        303 non-null    bool 
 3   trestbps  303 non-null    bool 
 4   chol      303 non-null    bool 
 5   fbs       303 non-null    bool 
 6   restecg   303 non-null    bool 
 7   thalach   303 non-null    bool 
 8   exang     303 non-null    bool 
 9   oldpeak   303 non-null    bool 
 10  slope     303 non-null    bool 
 11  ca        303 non-null    bool 
 12  thal      303 non-null    bool 
 13  target    303 non-null    bool 
dtypes: bool(14)
memory usage: 4.3 KB


# CODE HERE
df.describe().transpose()


# CODE HERE!
sns.countplot(data=df,x='target')

<AxesSubplot:xlabel='target', ylabel='count'>


# CODE HERE
cp=df[['age','trestbps', 'chol','thalach','target']]
sns.pairplot(data=cp,hue='target')

<seaborn.axisgrid.PairGrid at 0x1870783ac08>


# CODE HERE
plt.figure(figsize=(12,10))
sns.heatmap(df.corr(),annot=True,cmap='viridis')

<AxesSubplot:>


# CODE HERE
X=df.drop('target',axis=1)


y=df['target']


# CODE HERE
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=101)


# CODE HERE
from sklearn.preprocessing import StandardScaler


scaler=StandardScaler()


x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)


# CODE HERE
from sklearn.linear_model import LogisticRegression


model=LogisticRegression()


from sklearn.model_selection import GridSearchCV


penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
grid_model=GridSearchCV(model,param_grid={'C':C,'penalty':penalty})


grid_model.fit(x_train,y_train)

C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 
50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\linear_model\_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [       nan 0.82363636        nan 0.83104377        nan 0.82734007
        nan 0.82734007        nan 0.82734007        nan 0.82734007
        nan 0.82734007        nan 0.82734007        nan 0.82734007
        nan 0.82734007]
  category=UserWarning,

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2']})


# CODE HERE
grid_model.best_params_

{'C': 2.7825594022071245, 'penalty': 'l2'}


grid_model.best_estimator_.coef_

array([[-0.06862347, -0.76677567,  0.92401506, -0.27433714, -0.22673577,
         0.04684481,  0.12315594,  0.44657231, -0.43416162, -0.53866102,
         0.39453632, -0.88123288, -0.58989011]])


label=df.drop('target',axis=1).columns


df=pd.DataFrame(columns=label,data=grid_model.best_estimator_.coef_)

df


plt.figure(figsize=(12,10))
sns.barplot(df)

<AxesSubplot:>


# CODE HERE
from sklearn.metrics import plot_confusion_matrix,confusion_matrix,classification_report


y_pred=grid_model.predict(x_test)


confusion_matrix(y_test,y_pred)

array([[12,  3],
       [ 2, 14]], dtype=int64)


# CODE HERE
# that means we have predicted 12 true negative , 2 as false nehative , 3 as false positive 12 as true positive 
plot_confusion_matrix(grid_model,x_test,y_test)

C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1870c2d4a48>


# CODE HERE
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.82      0.88      0.85        16

    accuracy                           0.84        31
   macro avg       0.84      0.84      0.84        31
weighted avg       0.84      0.84      0.84        31


# CODE HERE
from sklearn.metrics import plot_roc_curve , plot_precision_recall_curve
plot_precision_recall_curve(grid_model,x_test,y_test)

C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_precision_recall_curve is deprecated; Function `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: PrecisionRecallDisplay.from_predictions or PrecisionRecallDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)

<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x1870c2d7308>


# CODE HERE
plot_roc_curve(grid_model,x_test,y_test)

C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_roc_curve is deprecated; Function :func:`plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: :meth:`sklearn.metric.RocCurveDisplay.from_predictions` or :meth:`sklearn.metric.RocCurveDisplay.from_estimator`.
  warnings.warn(msg, category=FutureWarning)

<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1870c37b848>


patient = [[ 54. ,   1. ,   0. , 122. , 286. ,   0. ,   0. , 116. ,   1. ,
          3.2,   1. ,   2. ,   2. ]]


# EXPECTED PREDICTION

grid_model.predict(patient)

array([0], dtype=int64)


# EXPECTED PROBABILITY PER CLASS (Basically model should be extremely sure its in the 0 class)
grid_model.predict_proba(patient)

array([[1.00000000e+00, 7.74916259e-25]])

	count	mean	std	min	25%	50%	75%	max
age	303.0	54.366337	9.082101	29.0	47.5	55.0	61.0	77.0
sex	303.0	0.683168	0.466011	0.0	0.0	1.0	1.0	1.0
cp	303.0	0.966997	1.032052	0.0	0.0	1.0	2.0	3.0
trestbps	303.0	131.623762	17.538143	94.0	120.0	130.0	140.0	200.0
chol	303.0	246.264026	51.830751	126.0	211.0	240.0	274.5	564.0
fbs	303.0	0.148515	0.356198	0.0	0.0	0.0	0.0	1.0
restecg	303.0	0.528053	0.525860	0.0	0.0	1.0	1.0	2.0
thalach	303.0	149.646865	22.905161	71.0	133.5	153.0	166.0	202.0
exang	303.0	0.326733	0.469794	0.0	0.0	0.0	1.0	1.0
oldpeak	303.0	1.039604	1.161075	0.0	0.0	0.8	1.6	6.2
slope	303.0	1.399340	0.616226	0.0	1.0	1.0	2.0	2.0
ca	303.0	0.729373	1.022606	0.0	0.0	0.0	1.0	4.0
thal	303.0	2.313531	0.612277	0.0	2.0	2.0	3.0	3.0
target	303.0	0.544554	0.498835	0.0	0.0	1.0	1.0	1.0

Logistic Regression Project Exercise¶

Imports¶

Data¶

Exploratory Data Analysis and Visualization¶

Visualization Tasks¶

Machine Learning¶

Train | Test Split and Scaling¶

Logistic Regression Model¶

Coeffecients¶

Model Performance Evaluation¶

Performance Curves¶

Great Job!¶

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1